{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "from os.path import dirname, realpath, join\n",
    "base_dir = dirname(dirname(os.getcwd()))\n",
    "\n",
    "import pandas as pd\n",
    "from os.path import join\n",
    "sys.path.insert(0, base_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from config_path import PROSTATE_DATA_PATH, PLOTS_PATH, DATA_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "ge_file = join(PROSTATE_DATA_PATH,'processed/outputs_su2c_tcga_all_samples_n=980_rsem_results_filtered.tsv')\n",
    "ge_data = pd.read_csv(ge_file, sep=',',index_col=0 )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "coding_gene_file = join(DATA_PATH, 'genes/HUGO_genes/protein-coding_gene_with_coordinate_minimal.txt')\n",
    "coding_genes = pd.read_csv(coding_gene_file, sep='\\t', header=None)\n",
    "coding_genes.columns =['ch', 'start', 'end', 'gene']\n",
    "coding_genes_names = coding_genes[['gene']]\n",
    "coding_genes_names =coding_genes_names.set_index('gene')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1BG</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1CF</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2M</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2ML1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A3GALT2</th>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [A1BG, A1CF, A2M, A2ML1, A3GALT2]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coding_genes_names.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>transcript_id(s)</th>\n",
       "      <th>length</th>\n",
       "      <th>effective_length</th>\n",
       "      <th>expected_count</th>\n",
       "      <th>TPM</th>\n",
       "      <th>FPKM</th>\n",
       "      <th>sample</th>\n",
       "      <th>sequencing_type</th>\n",
       "      <th>tpm_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ENSG00000000003.14_2_TSPAN6</th>\n",
       "      <td>ENST00000373020.8_1_TSPAN6-201,ENST00000494424...</td>\n",
       "      <td>1872.50</td>\n",
       "      <td>1630.51</td>\n",
       "      <td>14.0</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.20</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000005.6_3_TNMD</th>\n",
       "      <td>ENST00000373031.5_2_TNMD-201,ENST00000485971.1...</td>\n",
       "      <td>873.50</td>\n",
       "      <td>631.57</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000419.12_3_DPM1</th>\n",
       "      <td>ENST00000371582.8_2_DPM1-201,ENST00000371584.8...</td>\n",
       "      <td>969.60</td>\n",
       "      <td>727.62</td>\n",
       "      <td>514.0</td>\n",
       "      <td>24.09</td>\n",
       "      <td>16.65</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000457.14_4_SCYL3</th>\n",
       "      <td>ENST00000367770.5_2_SCYL3-201,ENST00000367771....</td>\n",
       "      <td>3087.03</td>\n",
       "      <td>2845.03</td>\n",
       "      <td>155.0</td>\n",
       "      <td>1.86</td>\n",
       "      <td>1.28</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000460.17_6_C1orf112</th>\n",
       "      <td>ENST00000286031.10_4_C1orf112-201,ENST00000359...</td>\n",
       "      <td>3064.19</td>\n",
       "      <td>2822.19</td>\n",
       "      <td>87.0</td>\n",
       "      <td>1.05</td>\n",
       "      <td>0.73</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                transcript_id(s)  \\\n",
       "gene_id                                                                            \n",
       "ENSG00000000003.14_2_TSPAN6    ENST00000373020.8_1_TSPAN6-201,ENST00000494424...   \n",
       "ENSG00000000005.6_3_TNMD       ENST00000373031.5_2_TNMD-201,ENST00000485971.1...   \n",
       "ENSG00000000419.12_3_DPM1      ENST00000371582.8_2_DPM1-201,ENST00000371584.8...   \n",
       "ENSG00000000457.14_4_SCYL3     ENST00000367770.5_2_SCYL3-201,ENST00000367771....   \n",
       "ENSG00000000460.17_6_C1orf112  ENST00000286031.10_4_C1orf112-201,ENST00000359...   \n",
       "\n",
       "                                length  effective_length  expected_count  \\\n",
       "gene_id                                                                    \n",
       "ENSG00000000003.14_2_TSPAN6    1872.50           1630.51            14.0   \n",
       "ENSG00000000005.6_3_TNMD        873.50            631.57             0.0   \n",
       "ENSG00000000419.12_3_DPM1       969.60            727.62           514.0   \n",
       "ENSG00000000457.14_4_SCYL3     3087.03           2845.03           155.0   \n",
       "ENSG00000000460.17_6_C1orf112  3064.19           2822.19            87.0   \n",
       "\n",
       "                                 TPM   FPKM              sample  \\\n",
       "gene_id                                                           \n",
       "ENSG00000000003.14_2_TSPAN6     0.29   0.20  MO_1008-Tumor_Dura   \n",
       "ENSG00000000005.6_3_TNMD        0.00   0.00  MO_1008-Tumor_Dura   \n",
       "ENSG00000000419.12_3_DPM1      24.09  16.65  MO_1008-Tumor_Dura   \n",
       "ENSG00000000457.14_4_SCYL3      1.86   1.28  MO_1008-Tumor_Dura   \n",
       "ENSG00000000460.17_6_C1orf112   1.05   0.73  MO_1008-Tumor_Dura   \n",
       "\n",
       "                              sequencing_type                   tpm_id  \n",
       "gene_id                                                                 \n",
       "ENSG00000000003.14_2_TSPAN6              tcap  MO_1008-Tumor_Dura_tcap  \n",
       "ENSG00000000005.6_3_TNMD                 tcap  MO_1008-Tumor_Dura_tcap  \n",
       "ENSG00000000419.12_3_DPM1                tcap  MO_1008-Tumor_Dura_tcap  \n",
       "ENSG00000000457.14_4_SCYL3               tcap  MO_1008-Tumor_Dura_tcap  \n",
       "ENSG00000000460.17_6_C1orf112            tcap  MO_1008-Tumor_Dura_tcap  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ge_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# filtered_ge['HUGO_gene'] =  [x[-1] for x in filtered_ge.index.str.split('_', expand=False) ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[x[-1] for x in ge_data.head().index.str.split('_', expand=False) ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['ENSG00000000003.14', '2', 'TSPAN6'],\n",
       " ['ENSG00000000005.6', '3', 'TNMD'],\n",
       " ['ENSG00000000419.12', '3', 'DPM1'],\n",
       " ['ENSG00000000457.14', '4', 'SCYL3'],\n",
       " ['ENSG00000000460.17', '6', 'C1orf112']]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ge_data.head().index.str.split('_', expand=False).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "HUGO_genes = ge_data.index.str.extract(r'_([^_]+$)', expand=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "39953193"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(HUGO_genes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "ge_data['HUGO_gene']=HUGO_genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>transcript_id(s)</th>\n",
       "      <th>length</th>\n",
       "      <th>effective_length</th>\n",
       "      <th>expected_count</th>\n",
       "      <th>TPM</th>\n",
       "      <th>FPKM</th>\n",
       "      <th>sample</th>\n",
       "      <th>sequencing_type</th>\n",
       "      <th>tpm_id</th>\n",
       "      <th>HUGO_gene</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ENSG00000000003.14_2_TSPAN6</th>\n",
       "      <td>ENST00000373020.8_1_TSPAN6-201,ENST00000494424...</td>\n",
       "      <td>1872.50</td>\n",
       "      <td>1630.51</td>\n",
       "      <td>14.0</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.20</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "      <td>TSPAN6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000005.6_3_TNMD</th>\n",
       "      <td>ENST00000373031.5_2_TNMD-201,ENST00000485971.1...</td>\n",
       "      <td>873.50</td>\n",
       "      <td>631.57</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "      <td>TNMD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000419.12_3_DPM1</th>\n",
       "      <td>ENST00000371582.8_2_DPM1-201,ENST00000371584.8...</td>\n",
       "      <td>969.60</td>\n",
       "      <td>727.62</td>\n",
       "      <td>514.0</td>\n",
       "      <td>24.09</td>\n",
       "      <td>16.65</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "      <td>DPM1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000457.14_4_SCYL3</th>\n",
       "      <td>ENST00000367770.5_2_SCYL3-201,ENST00000367771....</td>\n",
       "      <td>3087.03</td>\n",
       "      <td>2845.03</td>\n",
       "      <td>155.0</td>\n",
       "      <td>1.86</td>\n",
       "      <td>1.28</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "      <td>SCYL3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ENSG00000000460.17_6_C1orf112</th>\n",
       "      <td>ENST00000286031.10_4_C1orf112-201,ENST00000359...</td>\n",
       "      <td>3064.19</td>\n",
       "      <td>2822.19</td>\n",
       "      <td>87.0</td>\n",
       "      <td>1.05</td>\n",
       "      <td>0.73</td>\n",
       "      <td>MO_1008-Tumor_Dura</td>\n",
       "      <td>tcap</td>\n",
       "      <td>MO_1008-Tumor_Dura_tcap</td>\n",
       "      <td>C1orf112</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                transcript_id(s)  \\\n",
       "gene_id                                                                            \n",
       "ENSG00000000003.14_2_TSPAN6    ENST00000373020.8_1_TSPAN6-201,ENST00000494424...   \n",
       "ENSG00000000005.6_3_TNMD       ENST00000373031.5_2_TNMD-201,ENST00000485971.1...   \n",
       "ENSG00000000419.12_3_DPM1      ENST00000371582.8_2_DPM1-201,ENST00000371584.8...   \n",
       "ENSG00000000457.14_4_SCYL3     ENST00000367770.5_2_SCYL3-201,ENST00000367771....   \n",
       "ENSG00000000460.17_6_C1orf112  ENST00000286031.10_4_C1orf112-201,ENST00000359...   \n",
       "\n",
       "                                length  effective_length  expected_count  \\\n",
       "gene_id                                                                    \n",
       "ENSG00000000003.14_2_TSPAN6    1872.50           1630.51            14.0   \n",
       "ENSG00000000005.6_3_TNMD        873.50            631.57             0.0   \n",
       "ENSG00000000419.12_3_DPM1       969.60            727.62           514.0   \n",
       "ENSG00000000457.14_4_SCYL3     3087.03           2845.03           155.0   \n",
       "ENSG00000000460.17_6_C1orf112  3064.19           2822.19            87.0   \n",
       "\n",
       "                                 TPM   FPKM              sample  \\\n",
       "gene_id                                                           \n",
       "ENSG00000000003.14_2_TSPAN6     0.29   0.20  MO_1008-Tumor_Dura   \n",
       "ENSG00000000005.6_3_TNMD        0.00   0.00  MO_1008-Tumor_Dura   \n",
       "ENSG00000000419.12_3_DPM1      24.09  16.65  MO_1008-Tumor_Dura   \n",
       "ENSG00000000457.14_4_SCYL3      1.86   1.28  MO_1008-Tumor_Dura   \n",
       "ENSG00000000460.17_6_C1orf112   1.05   0.73  MO_1008-Tumor_Dura   \n",
       "\n",
       "                              sequencing_type                   tpm_id  \\\n",
       "gene_id                                                                  \n",
       "ENSG00000000003.14_2_TSPAN6              tcap  MO_1008-Tumor_Dura_tcap   \n",
       "ENSG00000000005.6_3_TNMD                 tcap  MO_1008-Tumor_Dura_tcap   \n",
       "ENSG00000000419.12_3_DPM1                tcap  MO_1008-Tumor_Dura_tcap   \n",
       "ENSG00000000457.14_4_SCYL3               tcap  MO_1008-Tumor_Dura_tcap   \n",
       "ENSG00000000460.17_6_C1orf112            tcap  MO_1008-Tumor_Dura_tcap   \n",
       "\n",
       "                              HUGO_gene  \n",
       "gene_id                                  \n",
       "ENSG00000000003.14_2_TSPAN6      TSPAN6  \n",
       "ENSG00000000005.6_3_TNMD           TNMD  \n",
       "ENSG00000000419.12_3_DPM1          DPM1  \n",
       "ENSG00000000457.14_4_SCYL3        SCYL3  \n",
       "ENSG00000000460.17_6_C1orf112  C1orf112  "
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ge_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "ge_data_filtered = ge_data.join(coding_genes_names, how='inner', on='HUGO_gene')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12480801, 10)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ge_data_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = join(PROSTATE_DATA_PATH,'processed/outputs_su2c_tcga_all_samples_n=980_rsem_results_filtered.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "ge_data_filtered.to_csv(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:min_env]",
   "language": "python",
   "name": "conda-env-min_env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
